Using these data :
sales_train.csv test.csv sample_submission.csv items.csv item_categories.csv shops.csv
Data fields.
Attributes:
ID - an Id that represents a (Shop, Item) tuple within the test set shop_id - unique identifier of a shop item_id - unique identifier of a product item_category_id - unique identifier of item category item_cnt_day - number of products sold. You are predicting a monthly amount of this measure item_price - current price of an item date - date in format dd/mm/yyyy date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33 item_name - name of item shop_name - name of shop item_category_name - name of item category
#restart the kernel after installation
%pip install pandas numpy matplotlib seaborn plotly_express==0.4.0 --quiet
print('installation complete')
Note: you may need to restart the kernel to use updated packages. installation complete
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib
%matplotlib inline
df_items=pd.read_csv('items.csv')
df_shops=pd.read_csv('shops.csv')
df_sales=pd.read_csv('sales_train.csv')
df_itemscat=pd.read_csv('item_categories.csv')
df_test=pd.read_csv('test.csv')
print('Item_categories shape: {}'.format(df_itemscat.shape))
print('Items shape: {}'.format(df_items.shape))
print('Sales train shape: {}'.format(df_sales.shape))
print('Shops shape : {}'.format(df_shops.shape))
print('Test shape: {}'.format(df_test.shape))
Item_categories shape: (84, 2) Items shape: (22170, 3) Sales train shape: (2935849, 6) Shops shape : (60, 2) Test shape: (214200, 3)
#We can merge itemcat_df with items_df on item_category_id, We can also merge train_df with shops_df on shop id.
#Lastly merge the two dfs' on item_id
#Merging DFs
items_mdf=pd.merge(df_items,df_itemscat,on='item_category_id')
shops_mdf=pd.merge(df_shops,df_sales,on='shop_id')
train_mdf=pd.merge(items_mdf,shops_mdf,on='item_id')
#Check for duplicate rows
print('Total Rows Before Removing Duplicate : ', train_mdf.shape[0])
print('Total Duplicate Rows : ',train_mdf.duplicated().sum())
Total Rows Before Removing Duplicate : 2935849 Total Duplicate Rows : 6
#Removing dublicate
train_mdf= train_mdf[~train_mdf.duplicated()]
print('Total Rows After Removing Duplicate : ',train_mdf.shape[0])
Total Rows After Removing Duplicate : 2935843
#Identifying shops whose item_cnt_day, item_price were less than zero
negative_cnt_day=train_mdf[train_mdf['item_cnt_day']<0]['shop_id'].value_counts()
negative_item_price=train_mdf[train_mdf['item_price']<0]['shop_id'].value_counts()
#top shops where the item cnt is in negative
train_mdf[train_mdf['item_cnt_day']<0]['shop_id'].value_counts().sort_values(ascending=False).to_frame()[:20]
| shop_id | |
|---|---|
| 31 | 451 |
| 12 | 430 |
| 54 | 356 |
| 25 | 316 |
| 57 | 314 |
| 6 | 261 |
| 42 | 251 |
| 28 | 216 |
| 43 | 211 |
| 19 | 211 |
| 56 | 195 |
| 47 | 193 |
| 58 | 173 |
| 44 | 165 |
| 53 | 163 |
| 38 | 162 |
| 27 | 157 |
| 37 | 154 |
| 29 | 151 |
| 52 | 143 |
#Dealing with negative values on numerical columns
train_mdf.drop(train_mdf[train_mdf['item_cnt_day'] <0].index , inplace=True)
train_mdf.drop(train_mdf[train_mdf['item_price'] <0].index , inplace=True)
#Dealing with NA Values
train_mdf.fillna(0,inplace = True)
#Reorganizing our columns for easy readability
cols=['date','item_name','item_id', 'item_category_id', 'item_category_name','shop_name', 'shop_id','date_block_num', 'item_price',
'item_cnt_day']
train_mdf=train_mdf[cols]
train_mdf.head(5)
| date | item_name | item_id | item_category_id | item_category_name | shop_name | shop_id | date_block_num | item_price | item_cnt_day | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01.09.2014 | ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D | 0 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 20 | 58.0 | 1.0 |
| 1 | 24.08.2014 | ***В ЛУЧАХ СЛАВЫ (UNV) D | 2 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 19 | 58.0 | 1.0 |
| 2 | 12.11.2014 | ***В ЛУЧАХ СЛАВЫ (UNV) D | 2 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 22 | 58.0 | 1.0 |
| 3 | 05.07.2014 | ***ГОЛУБАЯ ВОЛНА (Univ) D | 3 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 18 | 100.0 | 1.0 |
| 4 | 26.08.2014 | ***ГОЛУБАЯ ВОЛНА (Univ) D | 3 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 19 | 58.0 | 1.0 |
#Convert dates to timeseries
train_mdf['date']=pd.to_datetime(train_mdf['date'])
#Feature Enginerring
train_mdf['sales'] = train_mdf['item_price']*train_mdf['item_cnt_day']
train_mdf['year'] = pd.DatetimeIndex(train_mdf['date']).year
train_mdf['month'] = pd.DatetimeIndex(train_mdf['date']).month
train_mdf.set_index('date', inplace=True)
train_mdf.head(3)
| item_name | item_id | item_category_id | item_category_name | shop_name | shop_id | date_block_num | item_price | item_cnt_day | sales | year | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | ||||||||||||
| 2014-01-09 | ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D | 0 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 20 | 58.0 | 1.0 | 58.0 | 2014 | 1 |
| 2014-08-24 | ***В ЛУЧАХ СЛАВЫ (UNV) D | 2 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 19 | 58.0 | 1.0 | 58.0 | 2014 | 8 |
| 2014-12-11 | ***В ЛУЧАХ СЛАВЫ (UNV) D | 2 | 40 | Кино - DVD | Химки ТЦ "Мега" | 54 | 22 | 58.0 | 1.0 | 58.0 | 2014 | 12 |
train_mdf.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 2928486 entries, 2014-01-09 to 2013-11-20 Data columns (total 12 columns): # Column Dtype --- ------ ----- 0 item_name object 1 item_id int64 2 item_category_id int64 3 item_category_name object 4 shop_name object 5 shop_id int64 6 date_block_num int64 7 item_price float64 8 item_cnt_day float64 9 sales float64 10 year int64 11 month int64 dtypes: float64(3), int64(6), object(3) memory usage: 290.5+ MB
train_mdf.nunique()
item_name 21804 item_id 21804 item_category_id 84 item_category_name 84 shop_name 60 shop_id 60 date_block_num 34 item_price 19977 item_cnt_day 189 sales 24090 year 3 month 12 dtype: int64
train_mdf.describe()
| item_id | item_category_id | shop_id | date_block_num | item_price | item_cnt_day | sales | year | month | |
|---|---|---|---|---|---|---|---|---|---|
| count | 2.928486e+06 | 2.928486e+06 | 2.928486e+06 | 2.928486e+06 | 2.928486e+06 | 2.928486e+06 | 2.928486e+06 | 2.928486e+06 | 2.928486e+06 |
| mean | 1.020028e+04 | 4.001639e+01 | 3.300294e+01 | 1.456976e+01 | 8.894670e+02 | 1.248338e+00 | 1.164373e+03 | 2.013777e+03 | 6.432986e+00 |
| std | 6.324391e+03 | 1.709812e+01 | 1.622543e+01 | 9.422956e+00 | 1.727500e+03 | 2.619589e+00 | 5.687677e+03 | 7.684602e-01 | 3.504512e+00 |
| min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.000000e-02 | 1.000000e+00 | 7.000000e-02 | 2.013000e+03 | 1.000000e+00 |
| 25% | 4.477000e+03 | 2.800000e+01 | 2.200000e+01 | 7.000000e+00 | 2.490000e+02 | 1.000000e+00 | 2.490000e+02 | 2.013000e+03 | 3.000000e+00 |
| 50% | 9.355000e+03 | 4.000000e+01 | 3.100000e+01 | 1.400000e+01 | 3.990000e+02 | 1.000000e+00 | 4.490000e+02 | 2.014000e+03 | 6.000000e+00 |
| 75% | 1.569100e+04 | 5.500000e+01 | 4.700000e+01 | 2.300000e+01 | 9.990000e+02 | 1.000000e+00 | 1.090000e+03 | 2.014000e+03 | 9.000000e+00 |
| max | 2.216900e+04 | 8.300000e+01 | 5.900000e+01 | 3.300000e+01 | 3.079800e+05 | 2.169000e+03 | 1.829990e+06 | 2.015000e+03 | 1.200000e+01 |
train_mdf[['shop_id','item_id','item_price','item_cnt_day']].corr()
| shop_id | item_id | item_price | item_cnt_day | |
|---|---|---|---|---|
| shop_id | 1.000000 | 0.029343 | -0.023915 | -0.005325 |
| item_id | 0.029343 | 1.000000 | -0.134069 | 0.016269 |
| item_price | -0.023915 | -0.134069 | 1.000000 | 0.011937 |
| item_cnt_day | -0.005325 | 0.016269 | 0.011937 | 1.000000 |
**Checking for Outliers in numerical columns
plt.rcParams['figure.figsize']=(16,4)
plt.subplot(1,2,1)
sns.boxplot(data=train_mdf['item_cnt_day'],orient='h')
plt.title('Boxplot for Item_cnt_day')
plt.subplot(1,2,2)
sns.boxplot(data=train_mdf['item_price'],orient='h')
plt.title('Boxplot for Item_price')
plt.grid()
plt.show()
#Using IQR Method
Q1 = np.percentile(train_mdf['item_price'],25.0,interpolation = 'midpoint')
Q3 = np.percentile(train_mdf['item_price'],75.0,interpolation = 'midpoint')
IQR = Q3 - Q1
# Upper bound
upper = np.where(train_mdf['item_price'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(train_mdf['item_price'] <= (Q1-1.5*IQR))
#interquartile range
print(f'Interquartile Range of the Item Price {IQR}')
print(f'Q1 is {Q1}')
print(f'Q3 is {Q3}')
Interquartile Range of the Item Price 750.0 Q1 is 249.0 Q3 is 999.0
train_mdf=train_mdf[(train_mdf['item_price']<1000.0) & (train_mdf['item_price']>=249.0)]
#Confirming that we have removed Outliers for (item_price)
sns.boxplot(data=train_mdf['item_price'],orient='h')
plt.title('Box Plot of the Item Price')
plt.show()
#Computing the statistics of 2 important attributes
print("item price median : {:.2f}, item price mean : {:.2f}".format(train_mdf['item_price'].median(),train_mdf['item_price'].mean()))
print("item_cnt_day median: {:.2f},item_cnt_day mean {:.2f} ".format(train_mdf['item_cnt_day'].median(),train_mdf['item_cnt_day'].mean()))
item price median : 399.00, item price mean : 509.36 item_cnt_day median: 1.00,item_cnt_day mean 1.18
#Distribution of Item Prices
sns.displot(train_mdf['item_price'])
plt.title(' Item_Price Distribution')
plt.show()
#Distribution for item_cnt_day(amount sold)
fig = px.histogram(train_mdf, x="item_cnt_day",nbins=1,title='item_cnt_day distribution')
fig.show()
Clearly Item_cnt_day follows a uniform distribution
Identifying Top shops and Item Categories
top_10items=train_mdf.groupby('item_category_name')['item_cnt_day'].sum().sort_values(ascending=False)[0:10].plot(kind='bar',color='g',edgecolor='Black')
plt.title('top 10 categories')
plt.ylabel('Sales')
plt.show()
top_10shops=train_mdf.groupby('shop_name')['item_cnt_day'].sum().sort_values(ascending=False)[0:10].plot(kind='bar',color='c',edgecolor='Black')
plt.title('top 10 Shops')
plt.ylabel('Sales')
plt.show()
**Time series analysis
#trend analysis of Target variable(item_cnt_day)
train_mdf['item_cnt_day'].resample('M').agg(['sum','max','std']).plot(subplots=True,figsize=(15,15))
plt.xlabel('month',size=10)
plt.ylabel('item count',size=10)
plt.show()
What are our findings from EDA?
We can clearly see that month is an important feature and could be key to the patterns formed over the 3 years.However, we will confirm this in our featureimortance during modelling
%pip install scikit-learn --upgrade --quiet
print('installation complete')
Note: you may need to restart the kernel to use updated packages. installation complete
train_mdf1=train_mdf.copy()
#Looking at the shape of test set
df_test
| ID | shop_id | item_id | |
|---|---|---|---|
| 0 | 0 | 5 | 5037 |
| 1 | 1 | 5 | 5320 |
| 2 | 2 | 5 | 5233 |
| 3 | 3 | 5 | 5232 |
| 4 | 4 | 5 | 5268 |
| ... | ... | ... | ... |
| 214195 | 214195 | 45 | 18454 |
| 214196 | 214196 | 45 | 16188 |
| 214197 | 214197 | 45 | 15757 |
| 214198 | 214198 | 45 | 19648 |
| 214199 | 214199 | 45 | 969 |
214200 rows × 3 columns
#reorganizing our train test factoring our test set
monthly_trainset = train_mdf1.groupby(['year','month','shop_id','item_id'])['item_cnt_day'].sum().reset_index()
monthly_trainset.head
<bound method NDFrame.head of year month shop_id item_id item_cnt_day 0 2013 1 0 33 2.0 1 2013 1 0 98 13.0 2 2013 1 0 491 1.0 3 2013 1 0 944 1.0 4 2013 1 0 1067 1.0 ... ... ... ... ... ... 952707 2015 12 59 20017 1.0 952708 2015 12 59 20093 1.0 952709 2015 12 59 20627 1.0 952710 2015 12 59 20866 1.0 952711 2015 12 59 21669 1.0 [952712 rows x 5 columns]>
sns.heatmap(monthly_trainset.corr(), annot=True)
<AxesSubplot:>
#**What does the heat map tell us?**
#Understanding the composition of our training set
plt.title('No. of Rows per Year')
sns.countplot(x=monthly_trainset.year);
#identifying shop_id and item_id in test dataset
shops_in_test = df_test['shop_id'].unique()
items_in_test = df_test['item_id'].unique()
monthly_trainset= monthly_trainset[monthly_trainset['shop_id'].isin(shops_in_test)]
monthly_trainset = monthly_trainset[monthly_trainset['item_id'].isin(items_in_test)]
monthly_trainset.shape
(363532, 5)
#splitting to get train,validation sets
train_df = monthly_trainset[monthly_trainset['year']<2015]
val_df= monthly_trainset[monthly_trainset['year']==2015]
print('train_df shape:',train_df.shape)
print('val_df shape:',val_df.shape)
train_df shape: (215059, 5) val_df shape: (148473, 5)
train_df.columns
Index(['year', 'month', 'shop_id', 'item_id', 'item_cnt_day'], dtype='object')
input_cols=['year', 'month', 'shop_id', 'item_id']
target_col='item_cnt_day'
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()
train_inputs.shape
(215059, 4)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
model=RandomForestRegressor()
model.fit(train_inputs,train_targets)
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor()
importance_df = pd.DataFrame({
'feature': train_inputs.columns,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
importance_df
| feature | importance | |
|---|---|---|
| 3 | item_id | 0.440937 |
| 1 | month | 0.273866 |
| 2 | shop_id | 0.247130 |
| 0 | year | 0.038067 |
plt.title('Random Forest Feature Importance')
sns.barplot(data=importance_df, x='importance', y='feature')
<AxesSubplot:title={'center':'Random Forest Feature Importance'}, xlabel='importance', ylabel='feature'>
#predicting using training inputs
train_pred=model.predict(train_inputs)
#calculating mean_squared_error for train set
MSE=mean_squared_error(train_targets,train_pred)
#calculating residual_mean_squared_error for train_set
RMSE=np.sqrt(MSE)
#calculating coefficient of determination for train_set
cod=r2_score(train_targets,train_pred)
print(f'mean_squared_error for train_set: {MSE}')
print(f'residual_mean_squared_error for train_set: {RMSE}')
print(f'coefficient_of_determination for train_set: {cod}')
mean_squared_error for train_set: 1.0776016976736615 residual_mean_squared_error for train_set: 1.0380759594912414 coefficient_of_determination for train_set: 0.9333396216257198
#Predicting our validation set
val_pred = model.predict(val_inputs)
#calculating mean_squared_error for train set
MSE=mean_squared_error(val_targets,val_pred)
#calculating residual_mean_squared_error for train_set
RMSE=np.sqrt(MSE)
#calculating coefficient of determination for train_set
cod=r2_score(val_targets,val_pred)
print(f'mean_squared_error for val_set: {MSE}')
print(f'residual_mean_squared_error for val_set: {RMSE}')
print(f'coefficient_of_determination for val_set: {cod}')
mean_squared_error for val_set: 11.976527450782298 residual_mean_squared_error for val_set: 3.4607119861066593 coefficient_of_determination for val_set: -0.6029401055831087
#***Plotting scatter plot for the train and validation sets
plt.scatter(train_targets, train_pred)
plt.xlabel('Actual targets')
plt.ylabel('Predicted targets')
plt.title('Sales Prediction for train set')
Text(0.5, 1.0, 'Sales Prediction for train set')
Our model has traces of overfitting.
plt.scatter(val_targets,val_pred)
plt.xlabel('Actual targets')
plt.ylabel('Predicted targets')
plt.title('Sales Prediction for validation set')
Text(0.5, 1.0, 'Sales Prediction for validation set')
**What are our findings?
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
#First we create the base model
base_model = RandomForestRegressor()
base_model.fit(train_inputs, train_targets)
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor()
#We then create a parameter grid to sample from during fitting:
#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 300, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
'max_depth': [10, 19, 28, 37, 46, 55, 64, 73, 82, 91, 100, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [10, 42, 74, 106, 138, 171, 203, 235, 267, 300]}
#we then instantiate the random search and fit by:
#using the random grid to search for best hyperparameters;
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 3,
cv = 3, verbose=2, random_state=42, n_jobs = -1)
%%time
# Fit the random search model
model_random.fit(train_inputs, train_targets)
Fitting 3 folds for each of 3 candidates, totalling 9 fits Wall time: 4min 38s
RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=3, n_jobs=-1,
param_distributions={'bootstrap': [True, False],
'max_depth': [10, 19, 28, 37, 46, 55,
64, 73, 82, 91, 100,
None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [10, 42, 74, 106, 138,
171, 203, 235, 267,
300]},
random_state=42, verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=3, n_jobs=-1,
param_distributions={'bootstrap': [True, False],
'max_depth': [10, 19, 28, 37, 46, 55,
64, 73, 82, 91, 100,
None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [10, 42, 74, 106, 138,
171, 203, 235, 267,
300]},
random_state=42, verbose=2)RandomForestRegressor()
RandomForestRegressor()
NB: The most important arguments in RandomizedSearchCV are:
1. **n_iter** - which controls the number of different combinations to try
2. **cv** - which is the number of folds to use for cross validation
# to view the best parameters for fitting random search:
model_random.best_params_
{'n_estimators': 74,
'min_samples_split': 10,
'min_samples_leaf': 4,
'max_features': 'sqrt',
'max_depth': 82,
'bootstrap': False}
# Function to Evaluate Random Search;
def evaluate(model, val_inputs, val_targets):
predictions = model.predict(val_inputs)
errors = abs(predictions - val_targets)
mape = 100 * np.mean(errors / val_targets)
accuracy = 100 - mape
print('Model Performance')
print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))
return accuracy
#we compare the base model with the best random search model
base_accuracy = evaluate(base_model, val_inputs, val_targets)
Model Performance Average Error: 1.1679 degrees. Accuracy = 31.36%.
#Our best random search model
best_random = model_random.best_estimator_
random_accuracy = evaluate(best_random,val_inputs, val_targets)
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))
Model Performance Average Error: 1.1565 degrees. Accuracy = 27.24%. Improvement of -13.15%.
test_df = df_test.copy()
test_df['year'] = '2015'
test_df['month'] = '11'
test_predict = best_random.predict(test_df[['year','month','shop_id','item_id']])
test_predict_df = pd.DataFrame(test_predict)
complete_test_df = pd.merge(test_df, test_predict_df, left_index=True, right_index=True)
complete_test_df = complete_test_df.rename(columns={0:'item_cnt_month'})
complete_test_df = complete_test_df[['ID','item_cnt_month']]
complete_test_df
| ID | item_cnt_month | |
|---|---|---|
| 0 | 0 | 1.198423 |
| 1 | 1 | 3.308301 |
| 2 | 2 | 1.206221 |
| 3 | 3 | 1.206221 |
| 4 | 4 | 1.350174 |
| ... | ... | ... |
| 214195 | 214195 | 1.291962 |
| 214196 | 214196 | 1.269600 |
| 214197 | 214197 | 1.007561 |
| 214198 | 214198 | 1.231093 |
| 214199 | 214199 | 1.115374 |
214200 rows × 2 columns
%pip install xgboost --upgrade --quiet
print('complete installation')
Note: you may need to restart the kernel to use updated packages. complete installation
# Importing the GBM from XGBoost.
from xgboost import XGBRegressor
model1 = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=20, max_depth=4)
#fit the model to the training set.
model1.fit(train_inputs,train_targets)
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=20, n_jobs=-1,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=20, n_jobs=-1,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, ...)#predict on train inputs
train_preds = model1.predict(train_inputs)
train_preds
array([1.387304 , 1.8411933, 1.5611197, ..., 2.1790385, 2.1790385,
2.0614622], dtype=float32)
#predict on val inputs
val_preds = model1.predict(val_inputs)
val_preds
array([1.669473 , 1.3234278, 1.6533157, ..., 2.2237637, 2.2237637,
2.5664053], dtype=float32)
**Model Evaluation
from sklearn.metrics import mean_squared_error
def gini_impurity(predictions, targets):
return mean_squared_error(predictions, targets, squared=False)
gini_impurity(train_preds,train_targets)
3.4833058319355925
gini_impurity(val_preds,val_targets)
2.5365870449067662
importance_df = pd.DataFrame({
'feature': train_inputs.columns,
'importance': model1.feature_importances_
}).sort_values('importance', ascending=False)
importance_df
| feature | importance | |
|---|---|---|
| 3 | item_id | 0.479729 |
| 1 | month | 0.216687 |
| 0 | year | 0.161804 |
| 2 | shop_id | 0.141779 |
plt.title('XGBOOST Feature Importance')
sns.barplot(data=importance_df, x='importance', y='feature')
<AxesSubplot:title={'center':'XGBOOST Feature Importance'}, xlabel='importance', ylabel='feature'>
#Hyperparameter Tuning
#We are going to adjust for; n_estimators, max_depth, eta, subsample, colsample_bytree
def test_params(**params):
modeltest = XGBRegressor(random_state=42, n_jobs=-1, **params).fit(train_inputs,train_targets)
return modeltest.score(train_inputs, train_targets), modeltest.score(val_inputs,val_targets)
#Adjusting n_estimators
test_params(n_estimators=100)
(0.5795718639001649, 0.012540289346551714)
test_params(n_estimators=600)
(0.7433775891879633, -0.2243542411030397)
#Adjusting for max_depth
test_params(max_depth=12)
(0.9208263164360713, -0.3740706425698608)
test_params(max_depth=6)
(0.5795718639001649, 0.012540289346551714)
test_params(n_estimators=100, max_depth=7, eta=0.1, subsample=1.0, colsample_bytree=1.0)
(0.5034255417296931, 0.11708801763848953)
test_params(n_estimators=50, max_depth=7, eta=0.1, subsample=1.0, colsample_bytree=1.0)
(0.42877263301225244, 0.15252548420995693)
#Predicting using XGBoost
model_adj=XGBRegressor(random_state=42, n_jobs=-1,n_estimators=50, max_depth=7, eta=0.1, subsample=1.0, colsample_bytree=1.0).fit(train_inputs,train_targets)
model_adj=XGBRegressor(random_state=42, n_jobs=-1,n_estimators=50, max_depth=7, eta=0.1, subsample=1.0, colsample_bytree=1.0).fit(train_inputs,train_targets)
test_df = df_test.copy()
test_df['year'] = '2015'
test_df['month'] = '11'
test_df=test_df[['year','month','shop_id','item_id']]
model_adj.fit(train_inputs,train_targets)
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
early_stopping_rounds=None, enable_categorical=False, eta=0.1,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.100000001, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=-1,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
early_stopping_rounds=None, enable_categorical=False, eta=0.1,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.100000001, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=-1,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, ...)